import os
import gzip
import pybedtools


# Identifiers of the genome annotation processed by this script.
organism = "human"
# Genome build; used as the sub-directory holding the GFF input files and
# as the prefix of the output file name.
assembly = "hg38"
# Species label expected in the "##source-version" header of each GFF file.
name = "Homo_sapiens"

# Collect, for every transcript in genes.gff, a window of `offset` bases on
# either side of its transcription start site (TSS).
directory = "/osc-fs_home/mdehoon/Data/NCBI/"
filename = "genes.gff"
path = os.path.join(directory, assembly, filename)
regions = {}  # chromosome name -> list of [start, end] windows
offset = 500  # bases kept on each side of the TSS
# The context manager guarantees the file is closed even when one of the
# sanity checks below fails part-way through.
with open(path) as handle:
    # The first line must be the header identifying the expected data release.
    line = next(handle)
    assert line.startswith("##")
    words = line.split()
    assert words[0] == "##source-version"
    assert words[1] == "NCBI-Entrez:%s.ags.gz" % name
    assert words[2] == "2020.04.26"
    # pybedtools parses the remaining records from the already-open handle.
    lines = pybedtools.BedTool(handle)
    for line in lines:
        assert line.fields[2] == 'transcript'
        chromosome = line.chrom
        strand = line.strand
        # Any strand other than '+' is treated as the reverse strand.
        # NOTE(review): presumably pybedtools reports 0-based, half-open
        # coordinates, making `end - 1` the last base of the feature - confirm.
        if strand == '+':
            tss = line.start
        else:
            tss = line.end - 1
        # Clamp at zero so windows never extend before the chromosome start.
        start = max(tss - offset, 0)
        end = tss + offset + 1
        regions.setdefault(chromosome, []).append([start, end])

# Add every exon in exons.gff, padded by `offset` bases on both sides, to the
# per-chromosome region lists.
directory = "/osc-fs_home/mdehoon/Data/NCBI/"
filename = "exons.gff"
path = os.path.join(directory, assembly, filename)
offset = 200  # padding added to each side of an exon
# The context manager guarantees the file is closed even when one of the
# sanity checks below fails part-way through.
with open(path) as handle:
    # The first line must be the header identifying the expected data release.
    line = next(handle)
    assert line.startswith("##")
    words = line.split()
    assert words[0] == "##source-version"
    assert words[1] == "NCBI-Entrez:%s.ags.gz" % name
    assert words[2] == "2020.04.26"
    # pybedtools parses the remaining records from the already-open handle.
    lines = pybedtools.BedTool(handle)
    for line in lines:
        chromosome = line.chrom
        # The padding is symmetric, so the strand of the exon is irrelevant.
        # Clamp at zero so regions never extend before the chromosome start.
        start = max(line.start - offset, 0)
        end = line.end + offset
        regions.setdefault(chromosome, []).append([start, end])

# Collapse the collected regions of each chromosome into a sorted list of
# non-overlapping [start, end] intervals.
mask = {}
for chromosome, intervals in regions.items():
    # Sorting by (start, end) guarantees that each interval starts at or
    # after the start of the most recently merged one, so a single pass
    # that only compares against the last merged interval is sufficient.
    intervals.sort()
    merged = []
    for start, end in intervals:
        if merged and start <= merged[-1][1]:
            # Overlaps or directly abuts the previous interval: extend it.
            merged[-1][1] = max(end, merged[-1][1])
        else:
            # Store a fresh list so the entries in `regions` are not
            # modified when a merged interval is extended later.
            merged.append([start, end])
    mask[chromosome] = merged

# Write the merged intervals as a three-column BED file, with chromosomes in
# sorted name order.
chromosomes = sorted(regions)
directory = "mask"
filename = "%s_neg_filter_500_merged.bed" % assembly

# Create the output directory unless it already exists.
os.makedirs(directory, exist_ok=True)

path = os.path.join(directory, filename)
print("Writing mask to %s" % path)
# The context manager flushes and closes the output file once all records
# are written, instead of leaving that to interpreter shutdown.
with open(path, 'w') as handle:
    for chromosome in chromosomes:
        for start, end in mask[chromosome]:
            fields = [chromosome, start, end]
            # pybedtools builds the interval; its string form is the text
            # written to the file.
            line = pybedtools.create_interval_from_list(fields)
            handle.write(str(line))
